{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "d7bdb898-57d4-4fff-95c3-62937342ec1c",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "003eb85b-d258-4ac7-b771-fe859d4e98ed",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "           Pclass         Age       SibSp       Parch        Fare\n",
      "count  891.000000  714.000000  891.000000  891.000000  891.000000\n",
      "mean     2.308642   29.699118    0.523008    0.381594   32.204208\n",
      "std      0.836071   14.526497    1.102743    0.806057   49.693429\n",
      "min      1.000000    0.420000    0.000000    0.000000    0.000000\n",
      "25%      2.000000   20.125000    0.000000    0.000000    7.910400\n",
      "50%      3.000000   28.000000    0.000000    0.000000   14.454200\n",
      "75%      3.000000   38.000000    1.000000    0.000000   31.000000\n",
      "max      3.000000   80.000000    8.000000    6.000000  512.329200\n",
      "                           Name   Sex  Ticket    Cabin Embarked\n",
      "count                       891   891     891      204      889\n",
      "unique                      891     2     681      147        3\n",
      "top     Braund, Mr. Owen Harris  male  347082  B96 B98        S\n",
      "freq                          1   577       7        4      644\n",
      "           Pclass         Age       SibSp       Parch        Fare\n",
      "count  418.000000  332.000000  418.000000  418.000000  417.000000\n",
      "mean     2.265550   30.272590    0.447368    0.392344   35.627188\n",
      "std      0.841838   14.181209    0.896760    0.981429   55.907576\n",
      "min      1.000000    0.170000    0.000000    0.000000    0.000000\n",
      "25%      1.000000   21.000000    0.000000    0.000000    7.895800\n",
      "50%      3.000000   27.000000    0.000000    0.000000   14.454200\n",
      "75%      3.000000   39.000000    1.000000    0.000000   31.500000\n",
      "max      3.000000   76.000000    8.000000    9.000000  512.329200\n",
      "                    Name   Sex    Ticket            Cabin Embarked\n",
      "count                418   418       418               91      418\n",
      "unique               418     2       363               76        3\n",
      "top     Kelly, Mr. James  male  PC 17608  B57 B59 B63 B66        S\n",
      "freq                   1   266         5                3      270\n"
     ]
    }
   ],
   "source": [
    "data = pd.DataFrame(pd.read_csv('train.csv'))\n",
    "data_test = pd.DataFrame(pd.read_csv('test.csv'))\n",
    "data_test = data_test[[\"Pclass\",\"Name\",\"Sex\",\"Age\",\"SibSp\",\"Parch\",\"Ticket\",\"Fare\",\"Cabin\",\"Embarked\"]]\n",
    "x = data[[\"Pclass\",\"Name\",\"Sex\",\"Age\",\"SibSp\",\"Parch\",\"Ticket\",\"Fare\",\"Cabin\",\"Embarked\"]]\n",
    "y = data[[\"Survived\"]]\n",
    "print(x.describe())\n",
    "print(x.describe(include=['O']))\n",
    "print(data_test.describe())\n",
    "print(data_test.describe(include=['O']))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b16b3825-c94e-4e0a-bf33-e45cdce1be23",
   "metadata": {},
   "source": [
    "# 2、数据清洗"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d4017244-a013-4ba9-b3de-c0a87fff75ad",
   "metadata": {},
   "source": [
    "## 2、1缺失值处理"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b04af75f-0757-4d40-885a-f39220ed8bc8",
   "metadata": {},
   "source": [
    "Age和Embarked 列存在少量缺失值，分别处理"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "e4404722-e843-47b6-b568-053617d73603",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 用众数填充缺失值\n",
    "data['Embarked'] = data['Embarked'].fillna('S')\n",
    "# 用平均数填充Age缺失值\n",
    "data[\"Age\"] = data['Age'].fillna(data[\"Age\"].mean())"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0366f834-2de1-496a-8bed-8363682a75b6",
   "metadata": {},
   "source": [
    "## 2.1 删除缺失率较大的列（初步处理）"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7789533b-fda2-480f-8488-60b21e1e252a",
   "metadata": {},
   "source": [
    "Cabin列的缺失率达到了75%,删除列"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "27253dd5-e360-4011-972b-d2f913d40822",
   "metadata": {},
   "outputs": [],
   "source": [
    "data = data.drop([\"Cabin\"], axis=1)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c7032523-8ed7-4555-9d4f-5615e4c4cedc",
   "metadata": {},
   "source": [
    "## 3 特征处理"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "83d9e138-8a87-4530-a94a-89a71d863a34",
   "metadata": {},
   "source": [
    "特征处理是基于具体的数据的，所以在特征处理之前要对数据做充分的理解。特征处理没有固定方法之说，\n",
    "主要靠个人的经验与观察，通过不断的尝试和变换，以期望挖掘出较好的特征变量。所以说，\n",
    "特征处理是模型建立过程中最耗时和耗神的工作"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "feb26f60-9f0d-4429-9179-7bfbc4e18112",
   "metadata": {},
   "source": [
    "### 3.1 单变量特征提取"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "441aa1d2-3d8f-40fc-bd33-9578659fbc45",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 根据name的长度,抽象出name_len特征\n",
    "data['name_len'] = data['Name'].apply(len)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "1fa83acf-f14e-492c-8fbb-99b356f64b5e",
   "metadata": {},
   "outputs": [],
   "source": [
    "data['name_class'] = data[\"Name\"].apply(lambda x : x.split(\",\")[1]).apply(lambda x: x.split()[0])"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6528879c-8d78-43b8-8e8b-5293c1d3f37c",
   "metadata": {},
   "source": [
    "3.2 多变量的组合"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "731d8e97-6c41-4bbf-b821-972d89f06ac8",
   "metadata": {},
   "source": [
    "sibsp 代表兄弟姐妹和配偶的数量\n",
    "parch 代表父母和子女的数量\n",
    "因此 可以将sibsp和parch结合获得家庭成员的数量"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "f6e9cab7-677f-43b9-b457-a7d3a778439e",
   "metadata": {},
   "outputs": [],
   "source": [
    "data['family_num'] = data['Parch'] + data['SibSp'] + 1"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "cfae3032-4f0e-4555-9d07-7f8cbb59a8f6",
   "metadata": {},
   "source": [
    "### 3.3 名义变量转数值变量"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "2f46ef6e-70f8-4615-bb10-690100c6b238",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Embarked\n",
    "data['Embarked'] = data['Embarked'].map({'S':1, 'C':2,'Q':3}).astype(int)\n",
    "# Sex\n",
    "data['Sex'] = data['Sex'].apply(lambda x: 0 if x =='male' else 1)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ae5a745f-464b-4b7b-8868-53b95fcdb8e2",
   "metadata": {},
   "source": [
    "### 3.4 数据分段"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f0c2f4a9-7a20-4810-8ef4-bff967664c34",
   "metadata": {},
   "source": [
    "根据统计信息和经验分段"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "id": "d335d62d-cc1d-4046-82a3-ccee1fadd18e",
   "metadata": {},
   "outputs": [],
   "source": [
    "def cutFeature(datalist:list, x):\n",
    "    for i in range(1,len(datalist)):\n",
    "        if x < datalist[i]:\n",
    "            break\n",
    "    return i"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "id": "01dba55a-900e-4272-9be7-59984964a793",
   "metadata": {},
   "outputs": [],
   "source": [
    "#[7.91,14.45,31.0]根据Fare的统计信息进行分段\n",
    "data[\"Fare\"] = data[\"Fare\"].apply(lambda x:cutFeature([7.91,14.45,31.0],x))\n",
    "#[18,48,64]按照经验分段\n",
    "data[\"Age\"] = data[\"Age\"].apply(lambda x:cutFeature([18,48,64],x))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "00d1086b-d6c1-423c-ac92-a4e9d3b73ae0",
   "metadata": {},
   "source": [
    "## 4. 模型选择与测试"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9cb49efb-a722-4699-9d4e-47474ef231e6",
   "metadata": {},
   "source": [
    "RandomForestClassifier\n",
    "\n",
    "ExtraTreesClassifier\n",
    "\n",
    "AdaBoostClassifier\n",
    "\n",
    "GradientBoostingClassifier\n",
    "\n",
    "SVC\n",
    "\n",
    "模型参数："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "id": "1c552d81-cfea-4285-a424-5b6aeddd1c43",
   "metadata": {},
   "outputs": [],
   "source": [
    "#随机森林\n",
    "rf_params = {\n",
    "    'n_jobs': -1,\n",
    "    'n_estimators': 500,\n",
    "    'warm_start': True,\n",
    "    # 'max_features': 0.2,\n",
    "    'max_depth': 6,\n",
    "    'min_samples_leaf': 2,\n",
    "    'max_features': 'sqrt',\n",
    "    'verbose': 0\n",
    "}\n",
    "# Extra Trees 随机森林\n",
    "et_params = {\n",
    "    'n_jobs': -1,\n",
    "    'n_estimators': 500,\n",
    "    # 'max_features': 0.5,\n",
    "    'max_depth': 8,\n",
    "    'min_samples_leaf': 2,\n",
    "    'verbose': 0\n",
    "}\n",
    "\n",
    "# AdaBoost \n",
    "ada_params = {\n",
    "    'n_estimators': 500,\n",
    "    'learning_rate': 0.75\n",
    "}\n",
    "\n",
    "# GBDT\n",
    "gb_params = {\n",
    "    'n_estimators': 500,\n",
    "    # 'max_features': 0.2,\n",
    "    'max_depth': 5,\n",
    "    'min_samples_leaf': 2,\n",
    "    'verbose': 0\n",
    "}\n",
    "\n",
    "# SVC\n",
    "svc_params = {\n",
    "    'kernel': 'linear',\n",
    "    'C': 0.025\n",
    "}"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "07778ff3-5da5-48c4-bc74-b9d60965f34d",
   "metadata": {},
   "source": [
    "模型选择代码："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "id": "376e5eb8-12f5-49d6-b62d-20eddcecf85b",
   "metadata": {},
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "id": "630113de-8997-461c-b308-ea29a33c75a0",
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.ensemble import RandomForestClassifier,ExtraTreesClassifier,AdaBoostClassifier,GradientBoostingClassifier\n",
    "from sklearn.svm import SVC\n",
    "from sklearn.model_selection import train_test_split"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "id": "43ea94cd-5c8f-4193-b5fc-ba973b9d681c",
   "metadata": {},
   "outputs": [
    {
     "ename": "InvalidIndexError",
     "evalue": "(slice(None, None, None), 0)",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mTypeError\u001b[0m                                 Traceback (most recent call last)",
      "File \u001b[1;32m~\\AppData\\Local\\miniconda3\\Lib\\site-packages\\pandas\\core\\indexes\\base.py:3791\u001b[0m, in \u001b[0;36mIndex.get_loc\u001b[1;34m(self, key)\u001b[0m\n\u001b[0;32m   3790\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m-> 3791\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_engine\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_loc\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcasted_key\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m   3792\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m err:\n",
      "File \u001b[1;32mindex.pyx:152\u001b[0m, in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[1;34m()\u001b[0m\n",
      "File \u001b[1;32mindex.pyx:158\u001b[0m, in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[1;34m()\u001b[0m\n",
      "\u001b[1;31mTypeError\u001b[0m: '(slice(None, None, None), 0)' is an invalid key",
      "\nDuring handling of the above exception, another exception occurred:\n",
      "\u001b[1;31mInvalidIndexError\u001b[0m                         Traceback (most recent call last)",
      "Cell \u001b[1;32mIn[49], line 10\u001b[0m\n\u001b[0;32m      8\u001b[0m holdout \u001b[38;5;241m=\u001b[39m [\u001b[38;5;241m0.95\u001b[39m, \u001b[38;5;241m0.90\u001b[39m, \u001b[38;5;241m0.75\u001b[39m, \u001b[38;5;241m0.50\u001b[39m, \u001b[38;5;241m0.01\u001b[39m]\n\u001b[0;32m      9\u001b[0m rounds \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m20\u001b[39m\n\u001b[1;32m---> 10\u001b[0m x_train \u001b[38;5;241m=\u001b[39m \u001b[43mdata\u001b[49m\u001b[43m[\u001b[49m\u001b[43m:\u001b[49m\u001b[43m,\u001b[49m\u001b[38;5;241;43m0\u001b[39;49m\u001b[43m]\u001b[49m\n\u001b[0;32m     11\u001b[0m labels_train \u001b[38;5;241m=\u001b[39m  train_np[:,\u001b[38;5;241m1\u001b[39m:]\n\u001b[0;32m     12\u001b[0m xx \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m1.\u001b[39m \u001b[38;5;241m-\u001b[39m np\u001b[38;5;241m.\u001b[39marray(holdout)\n",
      "File \u001b[1;32m~\\AppData\\Local\\miniconda3\\Lib\\site-packages\\pandas\\core\\frame.py:3893\u001b[0m, in \u001b[0;36mDataFrame.__getitem__\u001b[1;34m(self, key)\u001b[0m\n\u001b[0;32m   3891\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcolumns\u001b[38;5;241m.\u001b[39mnlevels \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m1\u001b[39m:\n\u001b[0;32m   3892\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_getitem_multilevel(key)\n\u001b[1;32m-> 3893\u001b[0m indexer \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcolumns\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_loc\u001b[49m\u001b[43m(\u001b[49m\u001b[43mkey\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m   3894\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m is_integer(indexer):\n\u001b[0;32m   3895\u001b[0m     indexer \u001b[38;5;241m=\u001b[39m [indexer]\n",
      "File \u001b[1;32m~\\AppData\\Local\\miniconda3\\Lib\\site-packages\\pandas\\core\\indexes\\base.py:3803\u001b[0m, in \u001b[0;36mIndex.get_loc\u001b[1;34m(self, key)\u001b[0m\n\u001b[0;32m   3798\u001b[0m     \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m(key) \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01merr\u001b[39;00m\n\u001b[0;32m   3799\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mTypeError\u001b[39;00m:\n\u001b[0;32m   3800\u001b[0m     \u001b[38;5;66;03m# If we have a listlike key, _check_indexing_error will raise\u001b[39;00m\n\u001b[0;32m   3801\u001b[0m     \u001b[38;5;66;03m#  InvalidIndexError. Otherwise we fall through and re-raise\u001b[39;00m\n\u001b[0;32m   3802\u001b[0m     \u001b[38;5;66;03m#  the TypeError.\u001b[39;00m\n\u001b[1;32m-> 3803\u001b[0m     \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_check_indexing_error\u001b[49m\u001b[43m(\u001b[49m\u001b[43mkey\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m   3804\u001b[0m     \u001b[38;5;28;01mraise\u001b[39;00m\n",
      "File \u001b[1;32m~\\AppData\\Local\\miniconda3\\Lib\\site-packages\\pandas\\core\\indexes\\base.py:5975\u001b[0m, in \u001b[0;36mIndex._check_indexing_error\u001b[1;34m(self, key)\u001b[0m\n\u001b[0;32m   5971\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_check_indexing_error\u001b[39m(\u001b[38;5;28mself\u001b[39m, key):\n\u001b[0;32m   5972\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m is_scalar(key):\n\u001b[0;32m   5973\u001b[0m         \u001b[38;5;66;03m# if key is not a scalar, directly raise an error (the code below\u001b[39;00m\n\u001b[0;32m   5974\u001b[0m         \u001b[38;5;66;03m# would convert to numpy arrays and raise later any way) - GH29926\u001b[39;00m\n\u001b[1;32m-> 5975\u001b[0m         \u001b[38;5;28;01mraise\u001b[39;00m InvalidIndexError(key)\n",
      "\u001b[1;31mInvalidIndexError\u001b[0m: (slice(None, None, None), 0)"
     ]
    }
   ],
   "source": [
    "classifiers=[\n",
    "    (\"rf_model\", RandomForestClassifier(**rf_params)),\n",
    "    (\"et_model\", ExtraTreesClassifier(**et_params)),\n",
    "    (\"ada_model\", AdaBoostClassifier(**ada_params)),\n",
    "    (\"gb_model\", GradientBoostingClassifier(**gb_params)),\n",
    "    (\"svc_model\", SVC(**svc_params)),\n",
    "]\n",
    "holdout = [0.95, 0.90, 0.75, 0.50, 0.01]\n",
    "rounds = 20\n",
    "x_train = data[:,0]\n",
    "labels_train =  train_np[:,1:]\n",
    "xx = 1. - np.array(holdout)\n",
    "for name, clf in classifiers:\n",
    "    print(\"training %s\" % name)\n",
    "    rng = np.random.RandomState(42)\n",
    "    yy = []\n",
    "    for i in holdout:\n",
    "        yy_ = []\n",
    "        for r in range(rounds):\n",
    "            X_train_turn, X_test_turn, y_train_turn, y_test_turn = \\\n",
    "                train_test_split(x_train, labels_train, test_size=i, random_state=rng)\n",
    "            clf.fit(X_train_turn, y_train_turn)\n",
    "            y_pred = clf.predict(X_test_turn)\n",
    "            yy_.append(1 - np.mean(y_pred == y_test_turn))\n",
    "        yy.append(np.mean(yy_))\n",
    "    plt.plot(xx, yy, label=name)\n",
    "\n",
    "plt.legend(loc=\"upper right\")\n",
    "plt.xlabel(\"Proportion train\")\n",
    "plt.ylabel(\"Test Error Rate\")\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "15548fd3-bb6e-481a-a317-1fd05da14a9b",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
